#' Clear environment.
## ----------------------------------------------------------------------------------------------------------------------------
remove(list = ls())

#' 
#' Load packages
## ----------------------------------------------------------------------------------------------------------------------------
library(tidyverse)
library(tidycensus)
library(readxl)

# set directories
source("./Code/set_directories.R")

#' 
#' 
#' ## ####################################### ##
#' ## Downloading block data using tidycensus ##
#' ## ####################################### ##
#' 
#' Add Census API key to the environment.
## ----------------------------------------------------------------------------------------------------------------------------
census_api_key("YOUR_API_KEY_GOES_HERE", overwrite = TRUE, install = TRUE)

#' 
#' We can get our ACS 2018 Variables here: https://api.census.gov/data/2018/acs/acs5/variables.html
#' tidycensus documentation here: https://cran.r-project.org/web/packages/tidycensus/tidycensus.pdf
#' tidycensus geographies here: https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus
#' 
#' 
#' Load State FIPS Codes and then create our state list to be pulled through the Census API.
#' Website: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/?cid=nrcs143_013696
## ----------------------------------------------------------------------------------------------------------------------------
state.list <- read.csv(file.path(DATA.PATH,"stateFIPS.txt"))
state.list <- c(state.list$PostalCode)
state.list <- state.list[!(state.list %in% c("AK","HI","AS","GU","MP","PR","VI"))]

#' 
#' Define variables to pull through API.
## ----------------------------------------------------------------------------------------------------------------------------
block.vars <- c("B02001_001E","B02001_002E","B02001_003E","B02001_004E","B02001_005E","B02001_006E","B02001_007E","B02001_008E","B03002_003E","B03002_012E","B11001_001E","B11001A_001E","B11001B_001E","B11001C_001E","B11001D_001E","B11001E_001E","B11001F_001E","B11001G_001E","B11001H_001E","B11001I_001E","B19013_001E","B19013A_001E","B19013B_001E","B19013C_001E","B19013D_001E","B19013E_001E","B19013F_001E","B19013G_001E","B19013H_001E","B19013I_001E")

#' Note -
#' B02001_001E  : Total Population "Number.POP"
#' B02001_002E  : Population of White Alone "Number.POP.white"
#' B02001_003E  : Population of Black Alone "Number.POP.black"
#' B02001_004E  : Population of American Indian Alone "Number.POP.ai"
#' B02001_005E  : Population of Asian Alone "Number.POP.asian"
#' B02001_006E  : Population of Pacific Islander Alone "Number.POP.pi"
#' B02001_007E  : Population of Other Race "Number.POP.other"
#' B02001_008E  : Population of Multi Racial "Number.POP.multi"
#' B03002_003E  : Population of White Alone (not Latino) "Number.POP.whitealone"
#' B03002_012E  : Population of Latino (any race) "Number.POP.latino"
#' B11001_001E  : Number of Households "Number.HH"
#' B11001A_001E : Number HHs of White Alone "Number.HH.white"
#' B11001B_001E : Number HHs of Black Alone "Number.HH.black"
#' B11001C_001E : Number HHs of American Indian Alone "Number.HH.ai"
#' B11001D_001E : Number HHs of Asian Alone "Number.HH.asian"
#' B11001E_001E : Number HHs of Pacific Islander Alone "Number.HH.pi"
#' B11001F_001E : Number HHs of Other Race "Number.HH.other"
#' B11001G_001E : Number HHs of Multi Racial "Number.HH.multi"
#' B11001H_001E : Number HHs of White Alone (not Latino) "Number.HH.whitealone"
#' B11001I_001E : Number HHs of Latino (any race) "Number.HH.latino"
#' B19013_001E  : Median HH income for all HH "Median.HH"
#' B19013A_001E : Median HH income for White Alone "Median.HH.white"
#' B19013B_001E : Median HH income for Black Alone "Median.HH.black"
#' B19013C_001E : Median HH income for American Indian Alone "Median.HH.ai"
#' B19013D_001E : Median HH income for Asian Alone "Median.HH.asian"
#' B19013E_001E : Median HH income for Pacific Islander Alone "Median.HH.pi"
#' B19013F_001E : Median HH income for Other Race Alone "Median.HH.other"
#' B19013G_001E : Median HH income for Multi Racial Alone "Median.HH.multi"
#' B19013H_001E : Median HH income for White Alone (not Latino) "Median.HH.whitealone"
#' B19013I_001E : Median HH income for Latino (any race) "Median.HH.latino"
#' 
#' Create Block Group dataframe.
## ----------------------------------------------------------------------------------------------------------------------------
## Code for block group
block.df <- get_acs(geography = "block group", variables = block.vars, year = 2018, output = "wide", state = state.list)

## Code for tract
#tract.df <- get_acs(geography = "tract", variables = block.vars, year = 2018, output = "wide", state = state.list)

## Code for county
#county.df <- get_acs(geography = "county", variables = block.vars, year = 2018, output = "wide")

#' 
#' Drop error columns and rename columns.
## ----------------------------------------------------------------------------------------------------------------------------
remove.moe <- seq(4,length(block.df),2)

names.block <- c("GEOID","Description","Number.POP","Number.POP.white","Number.POP.black","Number.POP.ai","Number.POP.asian","Number.POP.pi","Number.POP.other","Number.POP.multi","Number.POP.whitealone","Number.POP.latino","Number.HH","Number.HH.white","Number.HH.black","Number.HH.ai","Number.HH.asian","Number.HH.pi","Number.HH.other","Number.HH.multi","Number.HH.whitealone","Number.HH.latino","Median.HH","Median.HH.white","Median.HH.black","Median.HH.ai","Median.HH.asian","Median.HH.pi","Median.HH.other","Median.HH.multi","Median.HH.whitealone","Median.HH.latino")

## Code for block group
block.df <- block.df[,-remove.moe]
colnames(block.df)[1:ncol(block.df)] <- names.block

## Code for tract
tract.df <- tract.df[,-remove.moe]
colnames(tract.df)[1:ncol(tract.df)] <- names.block


#' 
#' Read in urban areas data. We used Max Masnick's work which finds the Census blocks in urban areas (unverified by Max). The read-in csv comes from Max Masnick's GitHub. His work is at the block level. Since we are using block groups, we drop the last 3 characters of the GEOID string.
#' Max describes what he did here: https://gis.stackexchange.com/questions/143829/identify-census-blocks-inside-urban-areas-using-pyshp-or-similar
#' Max's GitHub main page is here: https://github.com/masnick/census-blocks-in-urban-areas
#' Max's released output is here: https://github.com/masnick/census-blocks-in-urban-areas/releases
## ----------------------------------------------------------------------------------------------------------------------------
## Code for block group
urban.block.df <- read.csv(file.path(DATA.PATH,"blocks_with_population.csv"))
urban.block.df$GEOID <- substr(urban.block.df$block_geoid,1,12)
urban.block.df$Urban.Indicator <- 1
urban.block.df <- urban.block.df[,-c(1:(length(urban.block.df)-2))]
urban.block.df <- distinct(urban.block.df)

## Code for tract
urban.block.df$Tract.GEOID <- substr(urban.block.df$GEOID,1,11)
urban.tract.df <- urban.block.df[,c(3,2)]
colnames(urban.tract.df)[1] <- 'GEOID'
urban.tract.df <- distinct(urban.tract.df)


#' 
#' Left join our urban.block.df to block.df and change the block groups that have "NA" in the Urban Indicator column to "0".
## ----------------------------------------------------------------------------------------------------------------------------
## Code for block group
block.df <- left_join(x=block.df,y=urban.block.df,by="GEOID")
block.df$Urban.Indicator[is.na(block.df$Urban.Indicator)] <- 0

## Code for tract
tract.df <- left_join(x=tract.df,y=urban.tract.df,by='GEOID')
tract.df$Urban.Indicator[is.na(tract.df$Urban.Indicator)] <- 0

write.csv(tract.df,file.path(DATA.PROCESSED,"ACS2018_tract_raceincomes_withNAs.csv")) #this dataframe has NAs in the Median Income for some races

#' 
#' Create decile dummies for Median Household Income.
## ----------------------------------------------------------------------------------------------------------------------------
block.df <- block.df %>% mutate(Median.decile = ntile(Median.HH,10))

library(fastDummies)
block.df <- dummy_cols(block.df,select_columns = 'Median.decile')

#' 
#' Write block.df dataframe to file.
## ----------------------------------------------------------------------------------------------------------------------------
write.csv(block.df,file.path(DATA.PROCESSED,"ACS2018_blockgroup.csv"))

#' 
#' Assign values to race-level Median Income 'NAs' in the tract dataframe according to the Median income for all HHs in the tract.
## ----------------------------------------------------------------------------------------------------------------------------
tract2.df <- tract.df
tract2.df <- tract2.df %>% mutate(Median.HH.white = coalesce(Median.HH.white, Median.HH))
tract2.df <- tract2.df %>% mutate(Median.HH.black = coalesce(Median.HH.black, Median.HH))
tract2.df <- tract2.df %>% mutate(Median.HH.ai = coalesce(Median.HH.ai, Median.HH))
tract2.df <- tract2.df %>% mutate(Median.HH.asian = coalesce(Median.HH.asian, Median.HH))
tract2.df <- tract2.df %>% mutate(Median.HH.pi = coalesce(Median.HH.pi, Median.HH))
tract2.df <- tract2.df %>% mutate(Median.HH.other = coalesce(Median.HH.other, Median.HH))
tract2.df <- tract2.df %>% mutate(Median.HH.multi = coalesce(Median.HH.multi, Median.HH))
tract2.df <- tract2.df %>% mutate(Median.HH.whitealone = coalesce(Median.HH.whitealone, Median.HH))
tract2.df <- tract2.df %>% mutate(Median.HH.latino = coalesce(Median.HH.latino, Median.HH))

write.csv(tract2.df,file.path(DATA.PROCESSED,"ACS2018_tract_raceincomes_noNAs.csv")) #this dataframe has NAs in the Median Income replaced by the Median HH income of the tract

#' 
#' 
#' ## ############################################################### ##
#' ## Create county-level aggregated data (by race and income decile) ##
#' ## ############################################################### ##
#' 
#' Substring GEOID to get County codes.
## ----------------------------------------------------------------------------------------------------------------------------
block.df$County <- substr(block.df$GEOID,1,5)

#' 
#' Create the County dataframe and sum population totals by County Code from the block.df dataframe.
## ----------------------------------------------------------------------------------------------------------------------------
county.df <- block.df %>% group_by(County) %>% summarise(County.POP = sum(Number.POP), County.POP.white = sum(Number.POP.white), County.POP.black = sum(Number.POP.black), County.POP.ai = sum(Number.POP.ai), County.POP.asian = sum(Number.POP.asian), County.POP.pi = sum(Number.POP.pi), County.POP.other = sum(Number.POP.other), County.POP.multi = sum(Number.POP.multi), County.POP.whitealone = sum(Number.POP.whitealone), County.POP.latino = sum(Number.POP.latino)) 

#' 
#' Create a secondary County dataframe and sum population totals by decile and by County code from the block.df dataframe.
## ----------------------------------------------------------------------------------------------------------------------------
county.decile.df <- block.df %>% group_by(County,Median.decile) %>% summarise(County.POP = sum(Number.POP))
county.decile.df <- spread(county.decile.df,key = 'Median.decile',value = 'County.POP')
names.county.decile <- c('County','County.POP.decile1','County.POP.decile2','County.POP.decile3','County.POP.decile4','County.POP.decile5','County.POP.decile6','County.POP.decile7','County.POP.decile8','County.POP.decile9','County.POP.decile10','County.POP.decileNA')

colnames(county.decile.df) <- names.county.decile
county.decile.df[is.na(county.decile.df)] <- 0

#' 
#' Join the county.df and county.decile.df dataframes.
## ----------------------------------------------------------------------------------------------------------------------------
county.df <- left_join(x = county.df, y = county.decile.df, by = 'County')

#' 
#' Write county.df dataframe to file.
## ----------------------------------------------------------------------------------------------------------------------------
write.csv(county.df,file.path(DATA.PROCESSED,"ACS2018_countyaggregation.csv"))

#' 
#' 
#' ## ############################################################## ##
#' ## Create ZCTA5-level aggregated data (by race and income decile) ##
#' ## ############################################################## ##
#' 
#' Load 2010 ZCTA to Census Tract Relationship File.
#' Website: http://www2.census.gov/geo/docs/maps-data/data/rel/zcta_tract_rel_10.txt
## ----------------------------------------------------------------------------------------------------------------------------
tracts <- read.csv(file.path(DATA.PATH,"zcta_tract_rel_10.txt"), sep=",", header = TRUE, colClasses=c("ZCTA5"="character","STATE"="character","COUNTY"="character","TRACT"="character","GEOID"="character"))

#' 
#' Substring GEOID to get Tract codes.
## ----------------------------------------------------------------------------------------------------------------------------
block.df$Tract <- substr(block.df$GEOID,1,11)

#' 
#' Create the Tract dataframe and sum population totals by Tract Code from the block.df dataframe.
## ----------------------------------------------------------------------------------------------------------------------------
tract.df <- block.df %>% group_by(Tract) %>% summarise(Tract.POP = sum(Number.POP), Tract.POP.white = sum(Number.POP.white), Tract.POP.black = sum(Number.POP.black), Tract.POP.ai = sum(Number.POP.ai), Tract.POP.asian = sum(Number.POP.asian), Tract.POP.pi = sum(Number.POP.pi), Tract.POP.other = sum(Number.POP.other), Tract.POP.multi = sum(Number.POP.multi), Tract.POP.whitealone = sum(Number.POP.whitealone), Tract.POP.latino = sum(Number.POP.latino)) 

#' 
#' Create a secondary Tract dataframe and sum population totals by decile and by Tract code from the block.df dataframe.
## ----------------------------------------------------------------------------------------------------------------------------
tract.decile.df <- block.df %>% group_by(Tract,Median.decile) %>% summarise(Tract.POP = sum(Number.POP))
tract.decile.df <- spread(tract.decile.df,key = 'Median.decile',value = 'Tract.POP')
names.tract.decile <- c('Tract','Tract.POP.decile1','Tract.POP.decile2','Tract.POP.decile3','Tract.POP.decile4','Tract.POP.decile5','Tract.POP.decile6','Tract.POP.decile7','Tract.POP.decile8','Tract.POP.decile9','Tract.POP.decile10','Tract.POP.decileNA')

colnames(tract.decile.df) <- names.tract.decile
tract.decile.df[is.na(tract.decile.df)] <- 0

#' 
#' Join the county.df and county.decile.df dataframes.
## ----------------------------------------------------------------------------------------------------------------------------
tract.df <- left_join(x = tract.df, y = tract.decile.df, by = 'Tract')

#' 
#' Write county.df dataframe to file.
## ----------------------------------------------------------------------------------------------------------------------------
write.csv(tract.df,file.path(DATA.PROCESSED,"ACS2018_tractaggregation.csv"))

#' 
#' Load 2010 ZCTA to Census Tract Relationship File.
#' Website: http://www2.census.gov/geo/docs/maps-data/data/rel/zcta_tract_rel_10.txt
## ----------------------------------------------------------------------------------------------------------------------------
zip.to.tracts <- read.csv(file.path(DATA.PATH,"zcta_tract_rel_10.txt"), sep=",", header = TRUE, colClasses=c("ZCTA5"="character","STATE"="character","COUNTY"="character","TRACT"="character","GEOID"="character"))

#' 
#' Subset zip.to.tracts and then left join to tract.df dataframe.
## ----------------------------------------------------------------------------------------------------------------------------
zip.to.tracts <- zip.to.tracts[,c(1,5,22)]

colnames(zip.to.tracts)[2] <- 'Tract'

zip.df <- left_join(x= zip.to.tracts, y = tract.df, by = 'Tract')

#' 
#' Create ZCTA5-level columns for each variable of interest. Use the "TRPOPPCT" variable as a multiplier.
#' TRPOPPCT: The percentage of Total Population of the 2010 Census Tract represented by the record (ZCTA5).
## ----------------------------------------------------------------------------------------------------------------------------
zip.df$Zip.POP <- zip.df$TRPOPPCT * zip.df$Tract.POP / 100
zip.df$Zip.POP.white <- zip.df$TRPOPPCT * zip.df$Tract.POP.white / 100
zip.df$Zip.POP.black <- zip.df$TRPOPPCT * zip.df$Tract.POP.black / 100
zip.df$Zip.POP.ai <- zip.df$TRPOPPCT * zip.df$Tract.POP.ai / 100
zip.df$Zip.POP.asian <- zip.df$TRPOPPCT * zip.df$Tract.POP.asian / 100
zip.df$Zip.POP.pi <- zip.df$TRPOPPCT * zip.df$Tract.POP.pi / 100
zip.df$Zip.POP.other <- zip.df$TRPOPPCT * zip.df$Tract.POP.other / 100
zip.df$Zip.POP.multi <- zip.df$TRPOPPCT * zip.df$Tract.POP.multi / 100
zip.df$Zip.POP.whitealone <- zip.df$TRPOPPCT * zip.df$Tract.POP.whitealone / 100
zip.df$Zip.POP.latino <- zip.df$TRPOPPCT * zip.df$Tract.POP.latino / 100

zip.df$Zip.POP.decile1 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile1 / 100
zip.df$Zip.POP.decile2 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile2 / 100
zip.df$Zip.POP.decile3 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile3 / 100
zip.df$Zip.POP.decile4 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile4 / 100
zip.df$Zip.POP.decile5 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile5 / 100
zip.df$Zip.POP.decile6 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile6 / 100
zip.df$Zip.POP.decile7 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile7 / 100
zip.df$Zip.POP.decile8 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile8 / 100
zip.df$Zip.POP.decile9 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile9 / 100
zip.df$Zip.POP.decile10 <- zip.df$TRPOPPCT * zip.df$Tract.POP.decile10 / 100
zip.df$Zip.POP.decileNA <- zip.df$TRPOPPCT * zip.df$Tract.POP.decileNA / 100


#' 
#' Subset the zip.df to have just ZCTA5-level data. Aggregate data to the ZCTA5.
## ----------------------------------------------------------------------------------------------------------------------------
zip.df <- zip.df[,-c(2:24)]
zip.df[is.na(zip.df)] <- 0
zip.df <- aggregate(zip.df[,-1], by = list(zip.df$ZCTA5), FUN = sum)
colnames(zip.df)[1] <- 'ZCTA5'

#' 
#' Write zip.df dataframe to file.
## ----------------------------------------------------------------------------------------------------------------------------
write.csv(zip.df,file.path(DATA.PROCESSED,"ACS2018_zipaggregation.csv"))

#' 
